In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mpimg

Bank Client Data¶

1 - age (numeric)
2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5 - default: has credit in default? (categorical: 'no','yes','unknown')
6 - housing: has housing loan? (categorical: 'no','yes','unknown')
7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# related to the last contact of the current campaign:
8 - contact: contact communication type (categorical: 'cellular','telephone')
9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model (see the sketch after this list).
# other attributes:
12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14 - previous: number of contacts performed before this campaign and for this client (numeric)
15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
# social and economic context attributes
16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
17 - cons.price.idx: consumer price index - monthly indicator (numeric)
18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
20 - nr.employed: number of employees - quarterly indicator (numeric)
21 - y - has the client subscribed to a term deposit? (binary: 'yes','no')
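Following the note on duration in item 11, a realistic model should drop that column before training; a minimal sketch, assuming the df loaded in the next section:

# Drop the leakage-prone 'duration' column for a realistic predictive model
df_realistic = df.drop(columns=['duration'])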

Exploratory Data Analysis¶

In [2]:
df = pd.read_csv("../DATA/bank-full.csv")
In [3]:
# Based on domain experience, for marketing purposes we may want to cluster
# clients into 3 groups: took a loan, didn't take a loan, unknown.
plt.figure(figsize=(12,6),dpi=200)
sns.histplot(data=df,x='age',hue='loan')
Out[3]:
<AxesSubplot:xlabel='age', ylabel='Count'>
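The raw counts behind those three loan groups can be checked directly; a quick sketch:

# Count clients in each loan group ('yes', 'no', 'unknown')
df['loan'].value_counts()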
In [4]:
# pdays: number of days that passed by after the client was last contacted from a previous campaign
# (numeric; 999 means client was not previously contacted)

plt.figure(figsize=(12,6),dpi=200)
sns.histplot(data=df[df['pdays']!=999],x='pdays')
Out[4]:
<AxesSubplot:xlabel='pdays', ylabel='Count'>
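Since 999 is a sentinel for "never contacted", it is worth checking how many rows the filter above actually keeps; a quick sketch:

# Number of clients who were previously contacted (pdays != 999)
(df['pdays'] != 999).sum()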
In [5]:
# a duration of 1000 seconds is roughly 16.7 minutes
plt.figure(figsize=(12,6),dpi=200)
sns.histplot(data=df,x='duration',hue='contact')
Out[5]:
<AxesSubplot:xlabel='duration', ylabel='Count'>
In [6]:
plt.figure(figsize=(12,6),dpi=200)

sns.countplot(data=df,x='education',order=df['education'].value_counts().index)
plt.xticks(rotation=90);

Training the Model¶

We can't use categorical data directly; K-means clustering is a distance-based algorithm -> use dummy variables.

We also need to scale the data; K-means is distance-based, and unscaled features would dominate the distances -> scale the features.
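To make the dummy-variable step concrete, here is a tiny illustrative sketch (the 3-row toy frame is made up, not taken from the dataset):

# One-hot encoding a single categorical column with pd.get_dummies
toy = pd.DataFrame({'loan': ['yes', 'no', 'unknown']})
pd.get_dummies(toy)
# -> 0/1 indicator columns: loan_no, loan_unknown, loan_yes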

In [7]:
X=pd.get_dummies(df)
In [8]:
from sklearn.preprocessing import StandardScaler 
In [9]:
scaler= StandardScaler()
In [10]:
# No data leakage, as we don't have a label (unsupervised problem)
X=scaler.fit_transform(X)
In [11]:
from sklearn.cluster import KMeans
In [12]:
model=KMeans(n_clusters=2)
In [13]:
# 1- fit all the features to the model to find the cluster centers
# 2- predict which cluster each row belongs to
# Note: if you run the cell more than once you may get 0,0,0,...,1,1,1 instead;
# the zeros and ones are meaningless, they're just labels of the clusters.
# What matters is that the same rows end up in the same clusters.
cluster_labels=model.fit_predict(X)
cluster_labels
Out[13]:
array([1, 1, 1, ..., 0, 0, 0])
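As the note above says, the 0/1 labels can swap between runs because the initialization is random. If reproducible labels matter, a common fix is to set a seed; a minimal sketch (the seed value 42 is arbitrary):

# Fixing random_state makes the initialization, and hence the 0/1 labeling,
# reproducible across runs; the clusters themselves are unchanged.
model = KMeans(n_clusters=2, random_state=42)
cluster_labels = model.fit_predict(X)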
In [14]:
df=pd.get_dummies(df)
df['Cluster'] =cluster_labels
In [15]:
plt.figure(figsize=(12,6),dpi=200)
df.corr()['Cluster'].iloc[:-1].sort_values().plot(kind='bar')
Out[15]:
<AxesSubplot:>
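To read the same plot numerically, the features most strongly correlated with the cluster label can be listed directly; a quick sketch:

# Top 10 features most correlated (in absolute value) with the cluster label
df.corr()['Cluster'].drop('Cluster').abs().sort_values(ascending=False).head(10)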

Determining the K Value Using the Elbow Method¶

In [16]:
ssd = []

for k in range(2,10):
    model = KMeans(n_clusters=k)
    model.fit(X)
    # Sum of squared distances of samples to their closest cluster center
    ssd.append(model.inertia_)
In [17]:
ssd
Out[17]:
[2469792.3616627543,
 2370786.446603645,
 2271502.8081971155,
 2228290.0533834356,
 2157695.015264023,
 2074338.1385483479,
 2076251.5749846818,
 1995548.640403869]
In [18]:
plt.plot(range(2,10),ssd,'o--')
plt.xlabel("K Value")
plt.ylabel("Sum of Squared Distances")
Out[18]:
Text(0, 0.5, 'Sum of Squared Distances')
In [19]:
# Change in SSD from previous K value!
pd.Series(ssd).diff()
Out[19]:
0             NaN
1   -99005.915059
2   -99283.638407
3   -43212.754814
4   -70595.038119
5   -83356.876716
6     1913.436436
7   -80702.934581
dtype: float64
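Since raw SSD differences are scale-dependent, the relative (percentage) drop per added cluster can make the elbow easier to read. Note the small positive bump at index 6 above: K-means can land in a worse local minimum on a given run, and fixing random_state (or raising n_init) would smooth it out. A quick sketch:

# Percentage change in SSD relative to the previous K value
pd.Series(ssd, index=range(2, 10)).pct_change() * 100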

--------------------------------------------------------------------------------------------------------------¶

Color Quantization¶

In [20]:
image_as_array=mpimg.imread("Hadeel's_me_time.jpg")
In [21]:
# (R,G,B)
image_as_array
Out[21]:
array([[[174, 153, 168],
        [175, 154, 169],
        [175, 154, 169],
        ...,
        [ 95,  96, 101],
        [ 96,  96, 104],
        [ 96,  96, 104]],

       [[175, 157, 171],
        [175, 157, 171],
        [176, 158, 172],
        ...,
        [ 95,  96, 101],
        [ 95,  95, 103],
        [ 95,  95, 103]],

       [[178, 160, 174],
        [179, 161, 175],
        [180, 162, 176],
        ...,
        [ 96,  97, 102],
        [ 95,  95, 103],
        [ 95,  95, 103]],

       ...,

       [[172, 127, 150],
        [172, 127, 150],
        [172, 127, 150],
        ...,
        [201, 225, 225],
        [201, 225, 225],
        [201, 225, 225]],

       [[172, 127, 150],
        [172, 127, 150],
        [172, 127, 150],
        ...,
        [202, 226, 226],
        [200, 224, 224],
        [200, 224, 224]],

       [[172, 127, 150],
        [171, 126, 149],
        [171, 126, 149],
        ...,
        [202, 226, 226],
        [199, 223, 223],
        [199, 223, 223]]], dtype=uint8)
In [22]:
plt.figure(figsize=(6,6),dpi=100)
plt.imshow(image_as_array)
Out[22]:
<matplotlib.image.AxesImage at 0x2547155cac8>

Convert from 3d to 2d Array¶

In [23]:
# 3D array => 1280 rows (height), 1162 columns (width), 3 channels

# First Dimension (Height - h) → Represents the number of rows of pixels.

# Second Dimension (Width - w) → Represents the number of columns of pixels.

# Third Dimension (Channels - c) → Represents color information (typically 3 for RGB images).

(h,w,rgb)=image_as_array.shape
(h,w,rgb)
Out[23]:
(1280, 1162, 3)
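To confirm the (height, width, channels) ordering, a single pixel can be indexed as [row, column]; a quick sketch:

# RGB values of the top-left pixel (row 0, column 0)
image_as_array[0, 0]
# -> array([174, 153, 168], dtype=uint8), matching the first entry printed above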
In [24]:
# 2D array => height*width rows, 3 channels
# Rows → each row corresponds to a single pixel in the original image
# Columns → each column stores the RGB values for that pixel
images_as_2d_array = image_as_array.reshape(h*w,rgb)
In [25]:
images_as_2d_array.shape
Out[25]:
(1487360, 3)

Training K-Means Clustering¶

KMeans is designed to train on 2D data (sample rows and feature columns), which is why the image was reshaped above: (h, w, c) ---> (h * w, c)

In [26]:
from sklearn.cluster import KMeans
In [27]:
model= KMeans(n_clusters=20)
label=model.fit_predict(images_as_2d_array)
In [28]:
# 20 colors (one RGB code per cluster center)
model.cluster_centers_
Out[28]:
array([[179.08497623, 156.92511884, 144.75479109],
       [219.02596897, 221.15505002, 214.22445037],
       [104.81497285,  91.95223318,  83.54635376],
       [189.74249677, 172.71689254, 165.89847001],
       [181.18202613, 216.36212161, 228.68702586],
       [227.59565859, 220.00657499, 160.73084914],
       [167.62981414, 122.98687488, 145.0799323 ],
       [206.66526934, 207.06678184, 199.68490644],
       [217.50141054, 208.61443746, 148.80899082],
       [208.69399195, 153.46626657, 103.00016488],
       [ 39.04528227,  37.31329971,  36.27710809],
       [136.46617339, 111.22182378, 107.30387131],
       [132.0054727 , 153.92413463, 167.36051443],
       [185.50186031, 145.66167307, 164.82556187],
       [159.0311138 , 195.61132216, 213.05045797],
       [194.92928672, 229.25751486, 239.88826036],
       [159.31620303, 139.733546  , 122.44314397],
       [ 72.52850016,  69.40758237,  67.08604276],
       [194.91417204, 193.64641021, 189.2989296 ],
       [180.01698129, 191.71842945, 138.31592104]])
In [29]:
label
Out[29]:
array([13, 13, 13, ..., 15, 15, 15])
In [30]:
rgb_codes = model.cluster_centers_.round(0).astype(int)
quantized_image = np.reshape(rgb_codes[label], (h, w, rgb))
In [31]:
plt.figure(figsize=(6,6),dpi=100)
plt.imshow(quantized_image)
Out[31]:
<matplotlib.image.AxesImage at 0x254715d9748>
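To keep the result, the quantized array can be written back to disk, and the palette reduction can be quantified; a minimal sketch (the output filename is made up):

# Save the 20-color image; imsave expects uint8 values in [0, 255]
plt.imsave('quantized_20_colors.jpg', quantized_image.astype(np.uint8))

# Compare palette sizes: unique colors in the original vs. the 20 centers
len(np.unique(images_as_2d_array, axis=0))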